{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import matplotlib as plt\n",
    "import numpy as np\n",
    "\n",
    "from autotagger.stackoverflow.preprocess import load_pickle_sklearn_format\n",
    "from sklearn import cross_validation,linear_model\n",
    "from sklearn.datasets import make_multilabel_classification\n",
    "from sklearn.metrics import f1_score, precision_score, recall_score\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "X,Y = load_pickle_sklearn_format(\"1GB_100_features\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((166811, 100), (667245, 100), (166811, 100), (667245, 100))"
      ]
     },
     "execution_count": 4,
     "output_type": "execute_result",
     "metadata": {}
    }
   ],
   "source": [
    "X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X,Y,test_size=0.80, random_state=42)\n",
    "X_train.shape, X_test.shape, Y_train.shape, Y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = linear_model.LinearRegression()\n",
    "meta_clf = OneVsRestClassifier(clf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "OneVsRestClassifier(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),\n",
       "          n_jobs=1)"
      ]
     },
     "execution_count": 7,
     "output_type": "execute_result",
     "metadata": {}
    }
   ],
   "source": [
    "meta_clf.fit(X_train,Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_pred = meta_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/felipe/auto-tagger/venv3/lib/python3.4/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.018744593095539479"
      ]
     },
     "execution_count": 9,
     "output_type": "execute_result",
     "metadata": {}
    }
   ],
   "source": [
    "# macro average refers to the average f1_score for each label\n",
    "f1_score(Y_test,Y_pred, average='macro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/felipe/auto-tagger/venv3/lib/python3.4/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.23430741369424352"
      ]
     },
     "execution_count": 11,
     "output_type": "execute_result",
     "metadata": {}
    }
   ],
   "source": [
    "# if we just consider the labels that have had at least one instance predicted,\n",
    "# our score goes up:\n",
    "\n",
    "label_scores = f1_score(Y_test,Y_pred,average=None)\n",
    "valid_label_indices = np.nonzero(label_scores)[0]\n",
    "f1_score(Y_test,Y_pred,average='macro',labels=valid_label_indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.13040891760528536"
      ]
     },
     "execution_count": 12,
     "output_type": "execute_result",
     "metadata": {}
    }
   ],
   "source": [
    "# micro average refers to the average f1_score for each instance\n",
    "f1_score(Y_test,Y_pred,average='micro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    ""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3.0
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}